Visualizing scientific results

MACS 40700 University of Chicago

Tables or graphs

Smoothing lines

Smoothing lines

p +
  geom_smooth(method = "lm", se = FALSE)

Smoothing lines

p +
  geom_smooth(se = FALSE)

Smoothing lines

Smoothing lines

Smoothing lines

Multiple smoothing lines

library(gapminder)

ggplot(data = gapminder,
       mapping = aes(x = log(gdpPercap), y = lifeExp)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm") +
  geom_smooth(method = "lm", formula = y ~ splines::bs(x, df = 3)) +
  geom_smooth(method = "loess")

Multiple smoothing lines

model_colors <- RColorBrewer::brewer.pal(3, "Set1")
model_colors
## [1] "#E41A1C" "#377EB8" "#4DAF4A"
ggplot(data = gapminder,
       mapping = aes(x = log(gdpPercap), y = lifeExp)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", aes(color = "OLS", fill = "OLS")) +
  geom_smooth(method = "lm", formula = y ~ splines::bs(x, df = 3),
              aes(color = "Cubic Spline", fill = "Cubic Spline")) +
  geom_smooth(method = "loess",
              aes(color = "LOESS", fill = "LOESS")) +
  scale_color_manual(name = "Models", values = model_colors) +
  scale_fill_manual(name = "Models", values = model_colors) +
  theme(legend.position = "top")

Scatterplot matricies

credit <- read_csv("data/Credit.csv") %>%
  # remove first ID column
  select(-X1)
names(credit) <- stringr::str_to_lower(names(credit))   # convert column names to lowercase
glimpse(credit)
## Observations: 400
## Variables: 11
## $ income    <dbl> 14.9, 106.0, 104.6, 148.9, 55.9, 80.2, 21.0, 71.4, 1...
## $ limit     <int> 3606, 6645, 7075, 9504, 4897, 8047, 3388, 7114, 3300...
## $ rating    <int> 283, 483, 514, 681, 357, 569, 259, 512, 266, 491, 58...
## $ cards     <int> 2, 3, 4, 3, 2, 4, 2, 2, 5, 3, 4, 3, 1, 1, 2, 3, 3, 3...
## $ age       <int> 34, 82, 71, 36, 68, 77, 37, 87, 66, 41, 30, 64, 57, ...
## $ education <int> 11, 15, 11, 11, 16, 10, 12, 9, 13, 19, 14, 16, 7, 9,...
## $ gender    <chr> "Male", "Female", "Male", "Female", "Male", "Male", ...
## $ student   <chr> "No", "Yes", "No", "No", "No", "No", "No", "No", "No...
## $ married   <chr> "Yes", "Yes", "No", "No", "Yes", "No", "No", "No", "...
## $ ethnicity <chr> "Caucasian", "Asian", "Asian", "Asian", "Caucasian",...
## $ balance   <int> 333, 903, 580, 964, 331, 1151, 203, 872, 279, 1350, ...

Scatterplot matricies

pairs(select_if(credit, is.numeric))

Scatterplot matricies

library(GGally)

ggpairs(select_if(credit, is.numeric))

Scatterplot matricies

ggpairs(credit, mapping = aes(color = gender),
        columns = c("income", "limit", "rating", "cards", "age", "education", "balance"))

Scatterplot matricies

ggpairs(select_if(credit, is.numeric),
        lower = list(
          continuous = "smooth"
        )
)

Scatterplot matricies

ggpairs(select_if(credit, is.numeric),
        lower = list(
          continuous = wrap("smooth", alpha = .1, color = "blue")
        )
)

Scatterplot matricies

scatter_smooth <- function(data, mapping, ...) {
  ggplot(data = data, mapping = mapping) +
    # make data points transparent
    geom_point(alpha = .2) +
    # add default smoother
    geom_smooth(se = FALSE)
}

ggpairs(select_if(credit, is.numeric),
        lower = list(
          continuous = scatter_smooth
        )
)

ggpairs(credit, mapping = aes(color = gender),
        columns = c("income", "limit", "rating", "cards", "age", "education", "balance"),
        lower = list(
          continuous = scatter_smooth
        )
)

Scatterplot matricies

rcfss::scorecard %>%
  select(type:debt) %>%
  ggpairs

Heatmap of correlation coefficients

(mpg_lite <- select_if(mpg, is.numeric))
## # A tibble: 234 x 5
##    displ  year   cyl   cty   hwy
##    <dbl> <int> <int> <int> <int>
##  1  1.80  1999     4    18    29
##  2  1.80  1999     4    21    29
##  3  2.00  2008     4    20    31
##  4  2.00  2008     4    21    30
##  5  2.80  1999     6    16    26
##  6  2.80  1999     6    18    26
##  7  3.10  2008     6    18    27
##  8  1.80  1999     4    18    26
##  9  1.80  1999     4    16    25
## 10  2.00  2008     4    20    28
## # ... with 224 more rows
(cormat <- mpg_lite %>%
  cor %>%
  round(2))
##       displ  year   cyl   cty   hwy
## displ  1.00  0.15  0.93 -0.80 -0.77
## year   0.15  1.00  0.12 -0.04  0.00
## cyl    0.93  0.12  1.00 -0.81 -0.76
## cty   -0.80 -0.04 -0.81  1.00  0.96
## hwy   -0.77  0.00 -0.76  0.96  1.00

Heatmap of correlation coefficients

library(reshape2)
(melted_cormat <- melt(cormat))
##     Var1  Var2 value
## 1  displ displ  1.00
## 2   year displ  0.15
## 3    cyl displ  0.93
## 4    cty displ -0.80
## 5    hwy displ -0.77
## 6  displ  year  0.15
## 7   year  year  1.00
## 8    cyl  year  0.12
## 9    cty  year -0.04
## 10   hwy  year  0.00
## 11 displ   cyl  0.93
## 12  year   cyl  0.12
## 13   cyl   cyl  1.00
## 14   cty   cyl -0.81
## 15   hwy   cyl -0.76
## 16 displ   cty -0.80
## 17  year   cty -0.04
## 18   cyl   cty -0.81
## 19   cty   cty  1.00
## 20   hwy   cty  0.96
## 21 displ   hwy -0.77
## 22  year   hwy  0.00
## 23   cyl   hwy -0.76
## 24   cty   hwy  0.96
## 25   hwy   hwy  1.00

Heatmap of correlation coefficients

ggplot(melted_cormat, aes(x = Var1, y = Var2, fill = value)) + 
  geom_tile()

Heatmap of correlation coefficients

# Get lower triangle of the correlation matrix
get_lower_tri<-function(cormat){
  cormat[upper.tri(cormat)] <- NA
  return(cormat)
}

# Get upper triangle of the correlation matrix
get_upper_tri <- function(cormat){
  cormat[lower.tri(cormat)]<- NA
  return(cormat)
}

upper_tri <- get_upper_tri(cormat)
upper_tri
##       displ year  cyl   cty   hwy
## displ     1 0.15 0.93 -0.80 -0.77
## year     NA 1.00 0.12 -0.04  0.00
## cyl      NA   NA 1.00 -0.81 -0.76
## cty      NA   NA   NA  1.00  0.96
## hwy      NA   NA   NA    NA  1.00

Heatmap of correlation coefficients

melted_cormat <- melt(upper_tri, na.rm = TRUE)

ggplot(melted_cormat, aes(Var2, Var1, fill = value))+
 geom_tile(color = "white") +
 scale_fill_gradient2(low = "blue", high = "red", mid = "white", 
   midpoint = 0, limit = c(-1,1), space = "Lab", 
   name="Pearson\nCorrelation") +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, 
    size = 12, hjust = 1)) +
 coord_fixed()

Heatmap of correlation coefficients

reorder_cormat <- function(cormat){
  # Use correlation between variables as distance
  dd <- as.dist((1-cormat)/2)
  hc <- hclust(dd)
  cormat <-cormat[hc$order, hc$order]
}

# Reorder the correlation matrix
cormat <- reorder_cormat(cormat)
upper_tri <- get_upper_tri(cormat)

# Melt the correlation matrix
melted_cormat <- melt(upper_tri, na.rm = TRUE)

# Create a ggheatmap
ggheatmap <- ggplot(melted_cormat, aes(Var2, Var1, fill = value))+
  geom_tile(color = "white")+
  scale_fill_gradient2(low = "blue", high = "red", mid = "white", 
                       midpoint = 0, limit = c(-1,1), space = "Lab", 
                       name="Pearson\nCorrelation") +
  theme_minimal()+ # minimal theme
  theme(axis.text.x = element_text(angle = 45, vjust = 1, 
                                   size = 12, hjust = 1))+
  coord_fixed()

# Print the heatmap
print(ggheatmap)

Heatmap of correlation coefficients

ggheatmap + 
  geom_text(aes(Var2, Var1, label = value), color = "black", size = 4) +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    panel.grid.major = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    axis.ticks = element_blank(),
    legend.position = "bottom")

Heatmap of correlation coefficients

cormat_heatmap <- function(data){
  # generate correlation matrix
  cormat <- round(cor(data), 2)
  
  # melt into a tidy table
  get_upper_tri <- function(cormat){
    cormat[lower.tri(cormat)]<- NA
    return(cormat)
  }
  
  upper_tri <- get_upper_tri(cormat)
  
  # reorder matrix based on coefficient value
  reorder_cormat <- function(cormat){
    # Use correlation between variables as distance
    dd <- as.dist((1-cormat)/2)
    hc <- hclust(dd)
    cormat <-cormat[hc$order, hc$order]
  }
  
  cormat <- reorder_cormat(cormat)
  upper_tri <- get_upper_tri(cormat)
  
  # Melt the correlation matrix
  melted_cormat <- melt(upper_tri, na.rm = TRUE)
  
  # Create a ggheatmap
  ggheatmap <- ggplot(melted_cormat, aes(Var2, Var1, fill = value))+
    geom_tile(color = "white")+
    scale_fill_gradient2(low = "blue", high = "red", mid = "white", 
                         midpoint = 0, limit = c(-1,1), space = "Lab", 
                         name="Pearson\nCorrelation") +
    theme_minimal()+ # minimal theme
    theme(axis.text.x = element_text(angle = 45, vjust = 1, 
                                     size = 12, hjust = 1))+
    coord_fixed()
  
  # add correlation values to graph
  ggheatmap + 
    geom_text(aes(Var2, Var1, label = value), color = "black", size = 4) +
    theme(
      axis.title.x = element_blank(),
      axis.title.y = element_blank(),
      panel.grid.major = element_blank(),
      panel.border = element_blank(),
      panel.background = element_blank(),
      axis.ticks = element_blank(),
      legend.position = "bottom")
}

cormat_heatmap(select_if(mpg, is.numeric))

cormat_heatmap(select_if(credit, is.numeric))

cormat_heatmap(select_if(diamonds, is.numeric))

Parallel coordinate plots

Three-dimensional plots

ggplot(mpg, aes(displ, hwy, color = class)) +
  geom_point()

Three-dimensional plots

# import data
(vote <- rcfss::mental_health)
## # A tibble: 1,317 x 5
##    vote96   age  educ female mhealth
##     <dbl> <dbl> <dbl>  <dbl>   <dbl>
##  1     1.   60.   12.     0.      0.
##  2     1.   36.   12.     0.      1.
##  3     0.   21.   13.     0.      7.
##  4     0.   29.   13.     0.      6.
##  5     1.   39.   18.     1.      2.
##  6     1.   41.   15.     1.      1.
##  7     1.   48.   20.     0.      2.
##  8     0.   20.   12.     1.      9.
##  9     0.   27.   11.     1.      9.
## 10     0.   34.    7.     1.      2.
## # ... with 1,307 more rows
# estimate model
vote_glm <- glm(vote96 ~ age + educ, data = vote, family = "binomial")
tidy(vote_glm)
##          term estimate std.error statistic  p.value
## 1 (Intercept)  -5.0244   0.44482     -11.3 1.38e-29
## 2         age   0.0469   0.00441      10.6 1.94e-26
## 3        educ   0.2816   0.02629      10.7 9.32e-27
# extract predicted probabilities
vote_prob <- vote %>%
  data_grid(age = 18:89, educ = 0:20) %>%
  modelr::add_predictions(vote_glm) %>%
  # convert predicted values to probabilities
  mutate(prob = rcfss::logit2prob(pred))

ggplot(vote_prob, aes(age, educ, fill = prob)) +
  geom_tile() +
  scale_fill_gradient2(midpoint = .5, label = scales::percent) +
  labs(title = "Probability of voter turnout in 1996",
       x = "Age",
       y = "Education (in years)",
       fill = "Probability\nof voting")

# cleaner image using geom_raster and interpolate = TRUE
ggplot(vote_prob, aes(age, educ, fill = prob)) +
  geom_raster(interpolate = TRUE) +
  scale_fill_gradient2(midpoint = .5, label = scales::percent) +
  labs(title = "Probability of voter turnout in 1996",
       x = "Age",
       y = "Education (in years)",
       fill = "Probability\nof voting")

plotly

plot_ly(vote_prob, x = ~age, y = ~educ, z = ~prob) %>%
  add_mesh()

plotly

plot_ly(credit, x = ~limit, y = ~balance, z = ~income) %>%
  add_mesh()

plotly

plot_ly(z = ~volcano) %>% add_surface()
volcano %>%
  melt %>%
  ggplot(aes(Var1, Var2, z = value)) +
  geom_contour(aes(color = ..level..))

Joint distribution of sex and happiness

# contingency table
library(descr)
crosstab(happy$happy, happy$sex, prop.t = TRUE, plot = FALSE)
##    Cell Contents 
## |-------------------------|
## |                   Count | 
## |           Total Percent | 
## |-------------------------|
## 
## =======================================
##                  happy$sex
## happy$happy       male   female   Total
## ---------------------------------------
## not too happy    1904     2460    4364 
##                   5.5%     7.1%        
## ---------------------------------------
## pretty happy     8760    10539   19299 
##                  25.2%    30.3%        
## ---------------------------------------
## very happy       4833     6327   11160 
##                  13.9%    18.2%        
## ---------------------------------------
## Total           15497    19326   34823 
## =======================================

Conditional distribution of sex given happiness

crosstab(happy$happy, happy$sex, prop.r = TRUE, plot = FALSE)
##    Cell Contents 
## |-------------------------|
## |                   Count | 
## |             Row Percent | 
## |-------------------------|
## 
## =======================================
##                  happy$sex
## happy$happy       male   female   Total
## ---------------------------------------
## not too happy    1904     2460    4364 
##                  43.6%    56.4%   12.5%
## ---------------------------------------
## pretty happy     8760    10539   19299 
##                  45.4%    54.6%   55.4%
## ---------------------------------------
## very happy       4833     6327   11160 
##                  43.3%    56.7%   32.0%
## ---------------------------------------
## Total           15497    19326   34823 
## =======================================
crosstab(happy$happy, happy$sex, prop.c = TRUE, plot = FALSE)
##    Cell Contents 
## |-------------------------|
## |                   Count | 
## |          Column Percent | 
## |-------------------------|
## 
## =======================================
##                  happy$sex
## happy$happy       male   female   Total
## ---------------------------------------
## not too happy    1904     2460    4364 
##                  12.3%    12.7%        
## ---------------------------------------
## pretty happy     8760    10539   19299 
##                  56.5%    54.5%        
## ---------------------------------------
## very happy       4833     6327   11160 
##                  31.2%    32.7%        
## ---------------------------------------
## Total           15497    19326   34823 
##                  44.5%    55.5%        
## =======================================

Conditional distribution of happiness given sex and marginal distribution of sex

crosstab(happy$happy, happy$sex, prop.c = TRUE, prop.r = TRUE, plot = FALSE)
##    Cell Contents 
## |-------------------------|
## |                   Count | 
## |             Row Percent | 
## |          Column Percent | 
## |-------------------------|
## 
## =======================================
##                  happy$sex
## happy$happy       male   female   Total
## ---------------------------------------
## not too happy    1904     2460    4364 
##                  43.6%    56.4%   12.5%
##                  12.3%    12.7%        
## ---------------------------------------
## pretty happy     8760    10539   19299 
##                  45.4%    54.6%   55.4%
##                  56.5%    54.5%        
## ---------------------------------------
## very happy       4833     6327   11160 
##                  43.3%    56.7%   32.0%
##                  31.2%    32.7%        
## ---------------------------------------
## Total           15497    19326   34823 
##                  44.5%    55.5%        
## =======================================

graphics::mosaicplot()

mosaicplot(~ sex + happy, data = happy)

vcd::mosaic()

library(vcd)
mosaic(~ happy + sex, happy)

productplots::prodplot()

# mosaic plot using productplots
prodplot(happy, ~ happy + sex)

# add color
prodplot(happy, ~ happy + sex) +
  aes(fill = happy) +
  theme(panel.grid = element_blank())

prodplot(happy, ~ happy + marital) +
  aes(fill = happy) +
  theme(legend.position = "none") +
  theme(panel.grid = element_blank())

productplots::prodplot()

ggplot(happy, aes(marital, fill = happy)) +
  geom_bar(position = "fill")

Dot plot for summary statistics

library(ISLR)

OJ_sum <- OJ %>%
  select(ends_with("MM"), ends_with("CH")) %>%
  gather(var, value) %>%
  group_by(var) %>%
  summarize(mean = mean(value),
            sd = sd(value),
            min = min(value),
            max = max(value),
            n = n())

# print the table
kable(OJ_sum)
var mean sd min max n
DiscCH 0.052 0.117 0.00 0.500 1070
DiscMM 0.123 0.214 0.00 0.800 1070
LoyalCH 0.566 0.308 0.00 1.000 1070
PctDiscCH 0.027 0.062 0.00 0.253 1070
PctDiscMM 0.059 0.102 0.00 0.402 1070
PriceCH 1.867 0.102 1.69 2.090 1070
PriceMM 2.085 0.134 1.69 2.290 1070
SalePriceCH 1.816 0.143 1.39 2.090 1070
SalePriceMM 1.962 0.253 1.19 2.290 1070
SpecialCH 0.148 0.355 0.00 1.000 1070
SpecialMM 0.162 0.368 0.00 1.000 1070
# plot using a single dot plot
ggplot(OJ_sum, aes(x = fct_reorder(var, mean), y = mean)) +
  geom_linerange(aes(ymin = mean - 2 * sd,
                      ymax = mean + 2 * sd),
                  linetype = 2,
                 size = .25) +
  geom_linerange(aes(ymin = mean - sd,
                      ymax = mean + sd),
                  size = 1) +
  geom_point() +
  coord_flip() +
  labs(x = NULL,
       y = NULL)

# dodge based on OJ brand
OJ_sum %>%
  separate(var, into = c("var", "brand"), -3, remove = TRUE) %>%
  ggplot(aes(x = fct_reorder(var, mean), y = mean, color = brand)) +
  geom_linerange(aes(ymin = mean - 2 * sd,
                      ymax = mean + 2 * sd),
                  linetype = 2,
                 size = .25,
                 position = position_dodge(width = 0.5)) +
  geom_linerange(aes(ymin = mean - sd,
                      ymax = mean + sd),
                  size = 1,
                 position = position_dodge(width = 0.5)) +
  geom_point(position = position_dodge(width = 0.5)) +
  coord_flip() +
  labs(x = NULL,
       y = NULL,
       color = "Brand")

# facet based on OJ brand
OJ_sum %>%
  separate(var, into = c("var", "brand"), -3, remove = TRUE) %>%
  ggplot(aes(x = fct_reorder(var, mean), y = mean)) +
  facet_grid(. ~ brand) +
  geom_linerange(aes(ymin = mean - 2 * sd,
                      ymax = mean + 2 * sd),
                  linetype = 2,
                 size = .25) +
  geom_linerange(aes(ymin = mean - sd,
                      ymax = mean + sd),
                  size = 1) +
  geom_point() +
  coord_flip() +
  labs(x = NULL,
       y = NULL,
       color = "Brand")

Visualizing model results

  • Present your findings in substantive terms
  • Show your degree of confidence
  • Show your data when you can

Extracting model contents

gapminder
## # A tibble: 1,704 x 6
##    country     continent  year lifeExp      pop gdpPercap
##    <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
##  1 Afghanistan Asia       1952    28.8  8425333      779.
##  2 Afghanistan Asia       1957    30.3  9240934      821.
##  3 Afghanistan Asia       1962    32.0 10267083      853.
##  4 Afghanistan Asia       1967    34.0 11537966      836.
##  5 Afghanistan Asia       1972    36.1 13079460      740.
##  6 Afghanistan Asia       1977    38.4 14880372      786.
##  7 Afghanistan Asia       1982    39.9 12881816      978.
##  8 Afghanistan Asia       1987    40.8 13867957      852.
##  9 Afghanistan Asia       1992    41.7 16317921      649.
## 10 Afghanistan Asia       1997    41.8 22227415      635.
## # ... with 1,694 more rows

Extracting model contents

gapminder %>% 
  ggplot(aes(year, lifeExp, group = country)) +
    geom_line(alpha = 1/3)

Extracting model contents

gapminder_mod <- lm(lifeExp ~ year, data = gapminder)
summary(gapminder_mod)
## 
## Call:
## lm(formula = lifeExp ~ year, data = gapminder)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -39.95  -9.65   1.70  10.33  22.16 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -585.6522    32.3140   -18.1   <2e-16 ***
## year           0.3259     0.0163    20.0   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.6 on 1702 degrees of freedom
## Multiple R-squared:  0.19,   Adjusted R-squared:  0.189 
## F-statistic:  399 on 1 and 1702 DF,  p-value: <2e-16
grid <- gapminder %>% 
  data_grid(year, country) 
grid
## # A tibble: 1,704 x 2
##     year country    
##    <int> <fct>      
##  1  1952 Afghanistan
##  2  1952 Albania    
##  3  1952 Algeria    
##  4  1952 Angola     
##  5  1952 Argentina  
##  6  1952 Australia  
##  7  1952 Austria    
##  8  1952 Bahrain    
##  9  1952 Bangladesh 
## 10  1952 Belgium    
## # ... with 1,694 more rows
grid <- grid %>% 
  add_predictions(gapminder_mod) 
grid
## # A tibble: 1,704 x 3
##     year country      pred
##    <int> <fct>       <dbl>
##  1  1952 Afghanistan  50.5
##  2  1952 Albania      50.5
##  3  1952 Algeria      50.5
##  4  1952 Angola       50.5
##  5  1952 Argentina    50.5
##  6  1952 Australia    50.5
##  7  1952 Austria      50.5
##  8  1952 Bahrain      50.5
##  9  1952 Bangladesh   50.5
## 10  1952 Belgium      50.5
## # ... with 1,694 more rows
ggplot(gapminder, aes(year, group = country)) +
  geom_line(aes(y = lifeExp), alpha = .2) +
  geom_line(aes(y = pred), data = grid, color = "red", size = 1)

Extracting model contents

str(gapminder_mod)
## List of 12
##  $ coefficients : Named num [1:2] -585.652 0.326
##   ..- attr(*, "names")= chr [1:2] "(Intercept)" "year"
##  $ residuals    : Named num [1:1704] -21.7 -21.8 -21.8 -21.4 -20.9 ...
##   ..- attr(*, "names")= chr [1:1704] "1" "2" "3" "4" ...
##  $ effects      : Named num [1:1704] -2455.1 232.2 -20.8 -20.5 -20.2 ...
##   ..- attr(*, "names")= chr [1:1704] "(Intercept)" "year" "" "" ...
##  $ rank         : int 2
##  $ fitted.values: Named num [1:1704] 50.5 52.1 53.8 55.4 57 ...
##   ..- attr(*, "names")= chr [1:1704] "1" "2" "3" "4" ...
##  $ assign       : int [1:2] 0 1
##  $ qr           :List of 5
##   ..$ qr   : num [1:1704, 1:2] -41.2795 0.0242 0.0242 0.0242 0.0242 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : chr [1:1704] "1" "2" "3" "4" ...
##   .. .. ..$ : chr [1:2] "(Intercept)" "year"
##   .. ..- attr(*, "assign")= int [1:2] 0 1
##   ..$ qraux: num [1:2] 1.02 1.03
##   ..$ pivot: int [1:2] 1 2
##   ..$ tol  : num 1e-07
##   ..$ rank : int 2
##   ..- attr(*, "class")= chr "qr"
##  $ df.residual  : int 1702
##  $ xlevels      : Named list()
##  $ call         : language lm(formula = lifeExp ~ year, data = gapminder)
##  $ terms        :Classes 'terms', 'formula'  language lifeExp ~ year
##   .. ..- attr(*, "variables")= language list(lifeExp, year)
##   .. ..- attr(*, "factors")= int [1:2, 1] 0 1
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:2] "lifeExp" "year"
##   .. .. .. ..$ : chr "year"
##   .. ..- attr(*, "term.labels")= chr "year"
##   .. ..- attr(*, "order")= int 1
##   .. ..- attr(*, "intercept")= int 1
##   .. ..- attr(*, "response")= int 1
##   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. ..- attr(*, "predvars")= language list(lifeExp, year)
##   .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
##   .. .. ..- attr(*, "names")= chr [1:2] "lifeExp" "year"
##  $ model        :'data.frame':   1704 obs. of  2 variables:
##   ..$ lifeExp: num [1:1704] 28.8 30.3 32 34 36.1 ...
##   ..$ year   : int [1:1704] 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
##   ..- attr(*, "terms")=Classes 'terms', 'formula'  language lifeExp ~ year
##   .. .. ..- attr(*, "variables")= language list(lifeExp, year)
##   .. .. ..- attr(*, "factors")= int [1:2, 1] 0 1
##   .. .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. .. ..$ : chr [1:2] "lifeExp" "year"
##   .. .. .. .. ..$ : chr "year"
##   .. .. ..- attr(*, "term.labels")= chr "year"
##   .. .. ..- attr(*, "order")= int 1
##   .. .. ..- attr(*, "intercept")= int 1
##   .. .. ..- attr(*, "response")= int 1
##   .. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. .. ..- attr(*, "predvars")= language list(lifeExp, year)
##   .. .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
##   .. .. .. ..- attr(*, "names")= chr [1:2] "lifeExp" "year"
##  - attr(*, "class")= chr "lm"

broom::tidy()

tidy(gapminder_mod)
##          term estimate std.error statistic  p.value
## 1 (Intercept) -585.652   32.3140     -18.1 2.90e-67
## 2        year    0.326    0.0163      20.0 7.55e-80
tidy(gapminder_mod) %>%
  str()
## 'data.frame':    2 obs. of  5 variables:
##  $ term     : chr  "(Intercept)" "year"
##  $ estimate : num  -585.652 0.326
##  $ std.error: num  32.314 0.0163
##  $ statistic: num  -18.1 20
##  $ p.value  : num  2.90e-67 7.55e-80

broom::augment()

augment(gapminder_mod) %>%
  as_tibble()
## # A tibble: 1,704 x 9
##    lifeExp  year .fitted .se.fit .resid     .hat .sigma .cooksd .std.resid
##      <dbl> <int>   <dbl>   <dbl>  <dbl>    <dbl>  <dbl>   <dbl>      <dbl>
##  1    28.8  1952    50.5   0.530  -21.7 0.00208    11.6 3.63e-3      -1.87
##  2    30.3  1957    52.1   0.463  -21.8 0.00158    11.6 2.79e-3      -1.88
##  3    32.0  1962    53.8   0.401  -21.8 0.00119    11.6 2.09e-3      -1.87
##  4    34.0  1967    55.4   0.348  -21.4 0.000895   11.6 1.51e-3      -1.84
##  5    36.1  1972    57.0   0.307  -20.9 0.000698   11.6 1.13e-3      -1.80
##  6    38.4  1977    58.7   0.285  -20.2 0.000599   11.6 9.07e-4      -1.74
##  7    39.9  1982    60.3   0.285  -20.4 0.000599   11.6 9.26e-4      -1.76
##  8    40.8  1987    61.9   0.307  -21.1 0.000698   11.6 1.15e-3      -1.81
##  9    41.7  1992    63.5   0.348  -21.9 0.000895   11.6 1.59e-3      -1.88
## 10    41.8  1997    65.2   0.401  -23.4 0.00119    11.6 2.42e-3      -2.01
## # ... with 1,694 more rows

broom::glance()

glance(gapminder_mod)
##   r.squared adj.r.squared sigma statistic  p.value df logLik   AIC   BIC
## 1      0.19         0.189  11.6       399 7.55e-80  2  -6598 13202 13218
##   deviance df.residual
## 1   230229        1702

stargazer

##          term estimate std.error statistic  p.value
## 1 (Intercept) -585.652   32.3140     -18.1 2.90e-67
## 2        year    0.326    0.0163      20.0 7.55e-80

stargazer

library(stargazer)

stargazer(gapminder_mod, type = "html")
Dependent variable:
lifeExp
year 0.326***
(0.016)
Constant -586.000***
(32.300)
Observations 1,704
R2 0.190
Adjusted R2 0.189
Residual Std. Error 11.600 (df = 1702)
F Statistic 399.000*** (df = 1; 1702)
Note: p<0.1; p<0.05; p<0.01

stargazer

gapminder_yr <- lm(lifeExp ~ year, data = gapminder)
gapminder_gdp <- lm(lifeExp ~ gdpPercap, data = gapminder)
gapminder_yr_gdp <- lm(lifeExp ~ year + gdpPercap, data = gapminder)
gapminder_gdp_year_lifeexp <- lm(gdpPercap ~ year + lifeExp, data = gapminder)

stargazer(gapminder_yr, gapminder_gdp, gapminder_yr_gdp, gapminder_gdp_year_lifeexp,
          type = "html")
Dependent variable:
lifeExp gdpPercap
(1) (2) (3) (4)
year 0.326*** 0.239*** -19.000
(0.016) (0.014) (12.500)
gdpPercap 0.001*** 0.001***
(0.00003) (0.00002)
lifeExp 457.000***
(16.700)
Constant -586.000*** 54.000*** -418.000*** 17,658.000
(32.300) (0.315) (27.600) (24,287.000)
Observations 1,704 1,704 1,704 1,704
R2 0.190 0.341 0.437 0.342
Adjusted R2 0.189 0.340 0.437 0.341
Residual Std. Error 11.600 (df = 1702) 10.500 (df = 1702) 9.690 (df = 1701) 8,003.000 (df = 1701)
F Statistic 399.000*** (df = 1; 1702) 880.000*** (df = 1; 1702) 661.000*** (df = 2; 1701) 441.000*** (df = 2; 1701)
Note: p<0.1; p<0.05; p<0.01

stargazer

stargazer(gapminder_yr, gapminder_gdp, gapminder_yr_gdp, gapminder_gdp_year_lifeexp,
          type = "html",
          dep.var.labels = c("Life expectancy", "GDP per capita"),
          covariate.labels = c("Year", "GDP per capita", "Life expectancy"),
          omit.stat = c("ser", "f"))
Dependent variable:
Life expectancy GDP per capita
(1) (2) (3) (4)
Year 0.326*** 0.239*** -19.000
(0.016) (0.014) (12.500)
GDP per capita 0.001*** 0.001***
(0.00003) (0.00002)
Life expectancy 457.000***
(16.700)
Constant -586.000*** 54.000*** -418.000*** 17,658.000
(32.300) (0.315) (27.600) (24,287.000)
Observations 1,704 1,704 1,704 1,704
R2 0.190 0.341 0.437 0.342
Adjusted R2 0.189 0.340 0.437 0.341
Note: p<0.1; p<0.05; p<0.01

Generate predicted values

out <- lm(formula = lifeExp ~ gdpPercap + pop + continent,
          data = gapminder)
summary(out)
## 
## Call:
## lm(formula = lifeExp ~ gdpPercap + pop + continent, data = gapminder)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -49.16  -4.49   0.30   5.11  25.17 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       4.78e+01   3.40e-01  140.82   <2e-16 ***
## gdpPercap         4.50e-04   2.35e-05   19.16   <2e-16 ***
## pop               6.57e-09   1.98e-09    3.33    9e-04 ***
## continentAmericas 1.35e+01   6.00e-01   22.46   <2e-16 ***
## continentAsia     8.19e+00   5.71e-01   14.34   <2e-16 ***
## continentEurope   1.75e+01   6.25e-01   27.97   <2e-16 ***
## continentOceania  1.81e+01   1.78e+00   10.15   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.37 on 1697 degrees of freedom
## Multiple R-squared:  0.582,  Adjusted R-squared:  0.581 
## F-statistic:  394 on 6 and 1697 DF,  p-value: <2e-16

Generate predicted values

min_gdp <- min(gapminder$gdpPercap)
max_gdp <- max(gapminder$gdpPercap)
med_pop <- median(gapminder$pop)

pred_df <- expand.grid(gdpPercap = (seq(from = min_gdp,
                                        to = max_gdp,
                                        length.out = 100)),
                       pop = med_pop,
                       continent = c("Africa", "Americas",
                                     "Asia", "Europe", "Oceania")) %>%
  as_tibble

pred_df
## # A tibble: 500 x 3
##    gdpPercap      pop continent
##        <dbl>    <dbl> <fct>    
##  1      241. 7023596. Africa   
##  2     1385. 7023596. Africa   
##  3     2530. 7023596. Africa   
##  4     3674. 7023596. Africa   
##  5     4818. 7023596. Africa   
##  6     5962. 7023596. Africa   
##  7     7107. 7023596. Africa   
##  8     8251. 7023596. Africa   
##  9     9395. 7023596. Africa   
## 10    10540. 7023596. Africa   
## # ... with 490 more rows

Generate predicted values

pred_out <- predict(object = out,
                    newdata = pred_df,
                    interval = "predict") %>%
  as_tibble
pred_out
## # A tibble: 500 x 3
##      fit   lwr   upr
##    <dbl> <dbl> <dbl>
##  1  48.0  31.5  64.4
##  2  48.5  32.1  64.9
##  3  49.0  32.6  65.4
##  4  49.5  33.1  65.9
##  5  50.0  33.6  66.4
##  6  50.5  34.1  67.0
##  7  51.1  34.6  67.5
##  8  51.6  35.1  68.0
##  9  52.1  35.7  68.5
## 10  52.6  36.2  69.0
## # ... with 490 more rows

Generate predicted values

pred_df <- bind_cols(pred_df, pred_out)
pred_df
## # A tibble: 500 x 6
##    gdpPercap      pop continent   fit   lwr   upr
##        <dbl>    <dbl> <fct>     <dbl> <dbl> <dbl>
##  1      241. 7023596. Africa     48.0  31.5  64.4
##  2     1385. 7023596. Africa     48.5  32.1  64.9
##  3     2530. 7023596. Africa     49.0  32.6  65.4
##  4     3674. 7023596. Africa     49.5  33.1  65.9
##  5     4818. 7023596. Africa     50.0  33.6  66.4
##  6     5962. 7023596. Africa     50.5  34.1  67.0
##  7     7107. 7023596. Africa     51.1  34.6  67.5
##  8     8251. 7023596. Africa     51.6  35.1  68.0
##  9     9395. 7023596. Africa     52.1  35.7  68.5
## 10    10540. 7023596. Africa     52.6  36.2  69.0
## # ... with 490 more rows

Generate predicted values

ggplot(data = pred_df %>%
         filter(continent %in% c("Europe", "Africa")),
       aes(x = gdpPercap,
           y = fit, ymin = lwr, ymax = upr,
           color = continent,
           fill = continent,
           group = continent)) +
  geom_point(data = gapminder %>%
               filter(continent %in% c("Europe", "Africa")),
             aes(x = gdpPercap, y = lifeExp,
                 color = continent),
             alpha = 0.5,
             inherit.aes = FALSE) + 
  geom_line() +
  geom_ribbon(alpha = 0.2, color = FALSE) +
  scale_x_log10(labels = scales::dollar)

Plot marginal effects

library(margins)
library(socviz)

glimpse(gss_sm)
## Observations: 2,867
## Variables: 32
## $ year        <dbl> 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, 20...
## $ id          <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,...
## $ ballot      <dbl+lbl> 1, 2, 3, 1, 3, 2, 1, 3, 1, 3, 2, 1, 2, 3, 2, 3...
## $ age         <dbl> 47, 61, 72, 43, 55, 53, 50, 23, 45, 71, 33, 86, 32...
## $ childs      <dbl> 3, 0, 2, 4, 2, 2, 2, 3, 3, 4, 5, 4, 3, 5, 7, 2, 6,...
## $ sibs        <dbl+lbl> 2, 3, 3, 3, 2, 2, 2, 6, 5, 1, 4, 4, 3, 6, 0, 1...
## $ degree      <fct> Bachelor, High School, Bachelor, High School, Grad...
## $ race        <fct> White, White, White, White, White, White, White, O...
## $ sex         <fct> Male, Male, Male, Female, Female, Female, Male, Fe...
## $ region      <fct> New England, New England, New England, New England...
## $ income16    <fct> $170000 or over, $50000 to 59999, $75000 to $89999...
## $ relig       <fct> None, None, Catholic, Catholic, None, None, None, ...
## $ marital     <fct> Married, Never Married, Married, Married, Married,...
## $ padeg       <fct> Graduate, Lt High School, High School, NA, Bachelo...
## $ madeg       <fct> High School, High School, Lt High School, High Sch...
## $ partyid     <fct> Independent, Ind,near Dem, Not Str Republican, Not...
## $ polviews    <fct> Moderate, Liberal, Conservative, Moderate, Slightl...
## $ happy       <fct> Pretty Happy, Pretty Happy, Very Happy, Pretty Hap...
## $ partners    <fct> NA, 1 Partner, 1 Partner, NA, 1 Partner, 1 Partner...
## $ grass       <fct> NA, Legal, Not Legal, NA, Legal, Legal, NA, Not Le...
## $ zodiac      <fct> Aquarius, Scorpio, Pisces, Cancer, Scorpio, Scorpi...
## $ pres12      <dbl+lbl> 3, 1, 2, 2, 1, 1, NA, NA, NA, 2, NA, NA, 1, 1,...
## $ wtssall     <dbl> 0.957, 0.478, 0.957, 1.914, 1.435, 0.957, 1.435, 0...
## $ income_rc   <fct> Gt $170000, Gt $50000, Gt $75000, Gt $170000, Gt $...
## $ agegrp      <fct> Age 45-55, Age 55-65, Age 65+, Age 35-45, Age 45-5...
## $ ageq        <fct> Age 34-49, Age 49-62, Age 62+, Age 34-49, Age 49-6...
## $ siblings    <fct> 2, 3, 3, 3, 2, 2, 2, 6+, 5, 1, 4, 4, 3, 6+, 0, 1, ...
## $ kids        <fct> 3, 0, 2, 4+, 2, 2, 2, 3, 3, 4+, 4+, 4+, 3, 4+, 4+,...
## $ religion    <fct> None, None, Catholic, Catholic, None, None, None, ...
## $ bigregion   <fct> Northeast, Northeast, Northeast, Northeast, Northe...
## $ partners_rc <fct> NA, 1, 1, NA, 1, 1, NA, 1, NA, 3, 1, NA, 1, NA, 0,...
## $ obama       <dbl> 0, 1, 0, 0, 1, 1, NA, NA, NA, 0, NA, NA, 1, 1, 0, ...
# make moderates the reference category (i.e. 0)
gss_sm$polviews_m <- relevel(gss_sm$polviews, ref = "Moderate")

# build the model
out_bo <- glm(obama ~ polviews_m + sex*race,
              family = "binomial", data = gss_sm)
summary(out_bo)
## 
## Call:
## glm(formula = obama ~ polviews_m + sex * race, family = "binomial", 
##     data = gss_sm)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -2.905  -0.554   0.177   0.542   2.244  
## 
## Coefficients:
##                                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                       0.29649    0.13409    2.21   0.0270 *  
## polviews_mExtremely Liberal       2.37295    0.52504    4.52  6.2e-06 ***
## polviews_mLiberal                 2.60003    0.35667    7.29  3.1e-13 ***
## polviews_mSlightly Liberal        1.29317    0.24843    5.21  1.9e-07 ***
## polviews_mSlightly Conservative  -1.35528    0.18129   -7.48  7.7e-14 ***
## polviews_mConservative           -2.34746    0.20038  -11.71  < 2e-16 ***
## polviews_mExtremely Conservative -2.72738    0.38721   -7.04  1.9e-12 ***
## sexFemale                         0.25487    0.14537    1.75   0.0796 .  
## raceBlack                         3.84953    0.50132    7.68  1.6e-14 ***
## raceOther                        -0.00214    0.43576    0.00   0.9961    
## sexFemale:raceBlack              -0.19751    0.66007   -0.30   0.7648    
## sexFemale:raceOther               1.57483    0.58766    2.68   0.0074 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2247.9  on 1697  degrees of freedom
## Residual deviance: 1345.9  on 1686  degrees of freedom
##   (1169 observations deleted due to missingness)
## AIC: 1370
## 
## Number of Fisher Scoring iterations: 6

Plot marginal effects

bo_m <- margins(out_bo)
summary(bo_m)
##                            factor     AME     SE        z      p   lower
##            polviews_mConservative -0.4119 0.0283 -14.5394 0.0000 -0.4674
##  polviews_mExtremely Conservative -0.4538 0.0420 -10.7971 0.0000 -0.5361
##       polviews_mExtremely Liberal  0.2681 0.0295   9.0996 0.0000  0.2103
##                 polviews_mLiberal  0.2768 0.0229  12.0736 0.0000  0.2319
##   polviews_mSlightly Conservative -0.2658 0.0330  -8.0596 0.0000 -0.3304
##        polviews_mSlightly Liberal  0.1933 0.0303   6.3896 0.0000  0.1340
##                         raceBlack  0.4032 0.0173  23.3568 0.0000  0.3694
##                         raceOther  0.1247 0.0386   3.2297 0.0012  0.0490
##                         sexFemale  0.0443 0.0177   2.5073 0.0122  0.0097
##    upper
##  -0.3564
##  -0.3714
##   0.3258
##   0.3218
##  -0.2011
##   0.2526
##   0.4371
##   0.2005
##   0.0789

Plot marginal effects

bo_gg <- as_tibble(summary(bo_m))
prefixes <- c("polviews_m", "sex")
bo_gg$factor <- prefix_strip(bo_gg$factor, prefixes)
bo_gg$factor <- prefix_replace(bo_gg$factor, "race", "Race: ")

bo_gg %>%
  select(factor, AME, lower, upper) 
## # A tibble: 9 x 4
##   factor                     AME    lower   upper
## * <chr>                    <dbl>    <dbl>   <dbl>
## 1 Conservative           -0.412  -0.467   -0.356 
## 2 Extremely Conservative -0.454  -0.536   -0.371 
## 3 Extremely Liberal       0.268   0.210    0.326 
## 4 Liberal                 0.277   0.232    0.322 
## 5 Slightly Conservative  -0.266  -0.330   -0.201 
## 6 Slightly Liberal        0.193   0.134    0.253 
## 7 Race: Black             0.403   0.369    0.437 
## 8 Race: Other             0.125   0.0490   0.200 
## 9 Female                  0.0443  0.00967  0.0789
ggplot(data = bo_gg, aes(x = reorder(factor, AME),
                         y = AME, ymin = lower, ymax = upper)) +
  geom_hline(yintercept = 0, color = "gray80") +
  geom_pointrange() + coord_flip() +
  labs(x = NULL, y = "Average Marginal Effect")